Loading packages
Loading the data:
model_4_21_95wRAW = NULL
model_2_7_21_new = NULL
model_2_7_21_new <- read_excel("~/Desktop/sage/model_2_7_21_newV4.xlsx")
model_4_21_95wRAW <- model_2_7_21_new
#library(tidyverse)
#model_4_21_95wRAW %>% select_if(negate(is.numeric))
dim(model_2_7_21_new)
[1] 2077 87
summPreds <- function(inpPred,inpTruth,inpMetrNms=c("err","acc","sens","spec")) {
retVals <- numeric()
for ( metrTmp in inpMetrNms ) {
retVals[metrTmp] <- performance(prediction(inpPred,inpTruth),measure=metrTmp)@y.values[[1]][2]
}
retVals
}
Train = NULL
Test = NULL
bTrain = sample(1:nrow(model_4_21_95wRAW),round(0.7*nrow(model_4_21_95wRAW)))
Train <- model_4_21_95wRAW[bTrain,2:87]
Test <- model_4_21_95wRAW[-bTrain,2:87]
Test = as.data.frame(Test)
Train$cat = as.numeric(as.factor(Train$cat))
Test$cat = as.numeric(as.factor(Test$cat))
Y = factor(as.numeric(as.factor(Train$cat)))
tuneRF(Train[1:85],Y,ntreeTry = 300,plot=TRUE)
mtry = 9 OOB error = 0.96%
Searching left ...
mtry = 5 OOB error = 1.44%
-0.5 0.05
Searching right ...
mtry = 18 OOB error = 0.41%
0.5714286 0.05
mtry = 36 OOB error = 0.55%
-0.3333333 0.05
mtry OOBError
5.OOB 5 0.014442916
9.OOB 9 0.009628611
18.OOB 18 0.004126547
36.OOB 36 0.005502063

Y = factor(as.numeric(as.factor(Train$cat)))
rfTmp <- randomForest(Y~.,data=Train[,1:85],mTry = 18,ntree = 1000)
rfTestPred <- predict(rfTmp,newdata=Test[,1:85])
mseTest <- mean((as.numeric(factor(Test$cat))-as.numeric(rfTestPred))) #Calculating the mean squared error
#tmpVals <- summPreds(as.numeric(factor(Test$cat)),as.numeric(rfTestPred))
VI_F=importance(rfTmp)
VI_F
MeanDecreaseGini
LNTcon1 7.6357064
LNTcon2 7.6823166
LNTcon3 6.1456542
LNTfh1 3.3675898
LNTfh2 3.6810801
LNTfh3 3.7280283
LNTreg1 45.3698007
LNTreg2 26.8094780
LNTreg3 34.0956535
LNTfr1 47.1328080
LNTfr2 49.9809364
LNTfr3 72.5765562
SpleenTcon1 3.6301799
SpleenTcon2 5.0774485
SpleenTcon3 2.7495229
SpleenTfh1 1.7927661
SpleenTfh2 1.7451146
SpleenTfh3 3.2312894
SpleenTreg1 10.9382510
SpleenTreg2 16.5781411
SpleenTreg3 18.3344852
SpleenTfr1 12.5759806
SpleenTfr2 12.9644749
SpleenTfr3 8.4423328
BloodTcon1 8.8024146
BloodTcon2 8.4828076
BloodTcon3 7.2863457
BloodTfh1 1.4808955
BloodTfh2 1.5113150
BloodTfh3 1.4598444
BloodTreg1 5.1066351
BloodTreg2 6.0971699
BloodTreg3 7.1942529
BloodTfr1 1.0420599
BloodTfr2 0.9905301
BloodTfr3 1.0568475
EZH2EXP_F_p_Treg_1 4.5791186
EZH2EXP_F_p_Treg_2 2.0443356
EZH2EXP_F_p_Treg_3 2.1459223
EZH2EXP_F_p_Tfr_1 7.8041261
EZH2EXP_F_p_Tfr_2 10.2892409
EZH2EXP_F_p_Tfr_3 10.9562897
BlimpEXP_F_p_Treg_1 4.5235946
BlimpEXP_F_p_Treg_2 3.5205116
BlimpEXP_F_p_Treg_3 2.1943278
BlimpEXP_F_p_Treg_4 3.3664066
BlimpEXP_F_p_Tfr_1 2.3628381
BlimpEXP_F_p_Tfr_2 4.9722807
BlimpEXP_F_p_Tfr_3 2.6237975
BlimpEXP_F_p_Tfr_4 2.1307726
FoxDownFoxP_Tfr_1 3.7171745
FoxDownFoxP_Tfr_2 2.0298604
FoxDownFoxP_Tfr_3 2.3420473
FoxDownFoxMTfr_1 1.4743007
FoxDownFoxMTfr_2 1.3139286
FoxDownFoxMTfr_3 1.1419481
FoxDown_Tfh_1 5.2228072
FoxDown_Tfh_2 15.1654368
FoxDown_Tfh_3 3.5967246
HDM_OVA_Tcon_1 9.7500222
HDM_OVA_Tcon_2 6.9236944
HDM_OVA_Tfh_1 1.3556755
HDM_OVA_Tfh_2 1.9274904
HDM_OVA_Tfr_1 1.4817007
HDM_OVA_Tfr_2 1.3912696
HDM_Tfh_1 1.7202721
HDM_Tfh_2 1.3921791
HDM_Tfr_1 2.5219640
HDM_Tfr_2 6.9118782
FoxF_CreN_Tcon_1 10.5643154
FoxF_CreN_Tcon_2 6.3363880
FoxF_CreN_Tcon_3 2.9943886
FoxF_CreN_Tcon_4 3.6819194
FoxF_CreN_Treg_1 2.2946092
FoxF_CreN_Treg_2 4.4240561
FoxF_CreN_Treg_3 2.0410392
FoxF_CreN_Treg_4 4.2720484
FoxF_CreN_ICOS_1 22.9569124
FoxF_CreN_ICOS_2 16.9067668
FoxF_CreN_ICOS_3 13.8311513
FoxF_CreN_ICOS_4 26.2319491
FoxF_CreN_Tfr_1 3.7286656
FoxF_CreN_Tfr_2 1.4987874
FoxF_CreN_Tfr_3 10.1542357
FoxF_CreN_Tfr_4 2.9102014
#pdf("Importance_plot.pdf")
varImpPlot(rfTmp,type=2)

#dev.off()
# LNTreg1 43.447022
# LNTreg2 27.107419
# LNTreg3 32.783442
# LNTfr1 53.615951
# LNTfr2 52.316917
# LNTfr3 67.765825
# FoxF_CreN_ICOS_4 29.519462
# pdf("Importance_plot.pdf")
# partialPlot(rfTmp,x.var="LNTfr3",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTfr1",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTfr2",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTreg1",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTreg3",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTreg2",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="FoxF_CreN_ICOS_4",pred.data = as.data.frame(Train),which.class = 2)
# dev.off()
partialPlot(rfTmp,x.var="BloodTfr1",pred.data = as.data.frame(Train),which.class = 2)

partialPlot(rfTmp,x.var="BloodTfr2",pred.data = as.data.frame(Train),which.class = 2)

partialPlot(rfTmp,x.var="BloodTfr3",pred.data = as.data.frame(Train),which.class = 2)

#
partialPlot(rfTmp,x.var="BloodTfr1",pred.data = as.data.frame(Train),which.class = 1)

partialPlot(rfTmp,x.var="BloodTfr2",pred.data = as.data.frame(Train),which.class = 1)

partialPlot(rfTmp,x.var="BloodTfr3",pred.data = as.data.frame(Train),which.class = 1)

#library(dplyr)
library(RColorBrewer)
imp <- as.data.frame(VI_F)
imp$varnames <- rownames(VI_F)
imp = imp %>% arrange(MeanDecreaseGini)
small = imp[65:85,]
#imp = order(imp)
dotchart(small$MeanDecreaseGini,labels = small$varnames,color = brewer.pal(n = 8, name = "Dark2"))

ggplot(data = small, aes(x = MeanDecreaseGini,y = reorder(varnames, MeanDecreaseGini))) +
geom_bar(stat="identity",fill = 'darkred')+
labs(y = "Condition") +
theme(legend.position = "top")

p = ggplot(small, aes(x = MeanDecreaseGini, y = reorder(varnames, MeanDecreaseGini))) +
geom_point(aes(color = MeanDecreaseGini, size = MeanDecreaseGini), alpha = 0.5)
p
ggsave(file="test.svg", plot=p, width=10, height=8)

library(readxl)
TfhTfr_Exon_AllGenes_combined <- read_excel("~/Desktop/sage/TfhTfr_Exon_AllGenes_combined.xlsx")
#Getting the class
#rfTestPred <- predict(rfTmp,newdata=TfhTfr_Exon_AllGenes_combined[,2:86])
rfTestPred <- predict(rfTmp,newdata=TfhTfr_Exon_AllGenes_combined[,2:86],type = "class")
#getting the probabilities
rfTestPred_prob <- predict(rfTmp,newdata=TfhTfr_Exon_AllGenes_combined[,2:86],type = "prob")
#Here I am trying to get some random names for the GSEA gene set.
random_names = character(2000)
random_names = sample(TfhTfr_Exon_AllGenes_combined$`Feature ID`,2000)
fileConn<-file("random_names.txt")
writeLines(random_names, fileConn)
close(fileConn)
2 is a Yes, 1 is a No
TfhTfr_Exon_AllGenes_combined = cbind(TfhTfr_Exon_AllGenes_combined,rfTestPred)
genes_vales = TfhTfr_Exon_AllGenes_combined[,c(1,87)]
genes_vales = cbind(genes_vales,rfTestPred_prob)
hist(as.numeric(genes_vales[,2]))

Here I generate an image of the final tree from the training/test data
#rfTestPred_prob
options(repos='http://cran.rstudio.org')
have.packages <- installed.packages()
cran.packages <- c('devtools','plotrix','randomForest','tree')
to.install <- setdiff(cran.packages, have.packages[,1])
if(length(to.install)>0) install.packages(to.install)
library(devtools)
Loading required package: usethis
if(!('reprtree' %in% installed.packages())){
install_github('araastat/reprtree')
}
for(p in c(cran.packages, 'reprtree')) eval(substitute(library(pkg), list(pkg=p)))
Attaching package: ‘plotrix’
The following object is masked from ‘package:psych’:
rescale
library(reprtree)
svg("TREE.svg", width=20, height=20,pointsize = 5)
reprtree:::plot.getTree(rfTmp)
dev.off()
null device
1
q-q plots of the p-values
qqnorm(rfTestPred_prob, pch = 1, frame = FALSE)
qqline(rfTestPred_prob, col = "steelblue", lwd = 2)

#genes_vales = cbind(genes_vales,rfTestPred_prob)
Yes_columns = NULL
Yes_columns = dplyr::filter( genes_vales, rfTestPred == 2 )
Yes_columns = data.frame(Yes_columns)
Yes_columns = Yes_columns[order(Yes_columns$X2,decreasing = TRUE),]
Yes_columns_sub = Yes_columns[1:2000,]
#View(Yes_columns_sub)
TfhTfr_Exon_AllGenes_combined_Yes = NULL
TfhTfr_Exon_AllGenes_combined_Yes = cbind(TfhTfr_Exon_AllGenes_combined,rfTestPred_prob)
TfhTfr_Exon_AllGenes_combined_Yes = dplyr::filter( TfhTfr_Exon_AllGenes_combined_Yes, rfTestPred == 2 )
TfhTfr_Exon_AllGenes_combined_Yes = data.frame(TfhTfr_Exon_AllGenes_combined_Yes)
TfhTfr_Exon_AllGenes_combined_Yes = TfhTfr_Exon_AllGenes_combined_Yes[order(TfhTfr_Exon_AllGenes_combined_Yes$X2,decreasing = TRUE),]
TfhTfr_Exon_AllGenes_combined_Yes_top_2000 = TfhTfr_Exon_AllGenes_combined_Yes[2:2000,]
library(pheatmap)
#small heatmap
genesList = c('Cxcr5','Icos','Pdcd1','Bcl6','Nrn1','Tnfrsf18','Gzmb','Sh2d1a','Prdm1','Cd44','Irf4','Ezh2','Il1r2','Cxcr7','Foxp3','Bcl6','Il2rg','Il2rb','Bcl6','Ctla4','Gata3','Id2','Il10','Maf')
colsToUse = 8:13
colsToUse = c(1,colsToUse)
McolsToUse = c(colsToUse,20:25) # Just Spleen
McolsToUse = c(McolsToUse,32:37) # Blood included
Tcon_McolsToUse = c(McolsToUse,2:4,14:16,26:28) # Tcon included
subsetYes = TfhTfr_Exon_AllGenes_combined_Yes[TfhTfr_Exon_AllGenes_combined_Yes$Feature.ID %in% genesList,colsToUse]
Copy_subsetYes = as.matrix(subsetYes[,2:7])
rownames(Copy_subsetYes) = subsetYes[,1]
p = pheatmap(Copy_subsetYes,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
p

ggsave(file="subListYesGenes.svg", plot=p, width=10, height=8)
#with more cell types
More_cell_types = TfhTfr_Exon_AllGenes_combined_Yes[TfhTfr_Exon_AllGenes_combined_Yes$Feature.ID %in% genesList,McolsToUse]
#MCopy_subsetYes = as.matrix(More_cell_types[,2:13]) # with just spleen
MCopy_subsetYes = as.matrix(More_cell_types[,2:19]) #Blood included
colsReorder = c(1:3,7:9,13:15,4:6,10:12,16:18)
MCopy_subsetYes_reOrder = MCopy_subsetYes[,colsReorder] #re-ordered cols
rownames(MCopy_subsetYes) = More_cell_types[,1]
rownames(MCopy_subsetYes_reOrder) = More_cell_types[,1]
View(MCopy_subsetYes_reOrder)
p = pheatmap(MCopy_subsetYes,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
p

RC_p = pheatmap(MCopy_subsetYes_reOrder,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
RC_p

ggsave(file="Proper_order_subListYesGenes_cell_types.svg", plot=RC_p, width=10, height=8)
###############
#adding Tcon
T_con_More_cell_types = TfhTfr_Exon_AllGenes_combined_Yes[TfhTfr_Exon_AllGenes_combined_Yes$Feature.ID %in% genesList,Tcon_McolsToUse]
#MCopy_subsetYes = as.matrix(More_cell_types[,2:13]) # with just spleen
T_con_MCopy_subsetYes = as.matrix(T_con_More_cell_types[,2:28]) #Tcon included
colsReorder = c(1:6,19:21,7:12,22:24,13:18,25:27)
T_con_MCopy_subsetYes_reOrder = T_con_MCopy_subsetYes[,colsReorder] #re-ordered cols
rownames(T_con_MCopy_subsetYes) = More_cell_types[,1]
rownames(T_con_MCopy_subsetYes_reOrder) = More_cell_types[,1]
View(T_con_MCopy_subsetYes_reOrder)
T_con_RC_p = pheatmap(T_con_MCopy_subsetYes_reOrder,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
T_con_RC_p

ggsave(file="Proper_order_subListYesGenes_cell_types_T_con_.svg", plot=T_con_RC_p, width=8, height=6)
##########
#large heatmap
holder = TfhTfr_Exon_AllGenes_combined_Yes_top_2000[1:50,8:13]
#View(holder)
holder = as.matrix(holder)
rownames(holder) = TfhTfr_Exon_AllGenes_combined_Yes_top_2000[1:50,1]
heatmap(holder)

p = pheatmap(holder,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
p

ggsave(file="test.svg", plot=p, width=10, height=8)
#large Top 50
holder = NULL
holder = TfhTfr_Exon_AllGenes_combined_Yes_top_2000[1:50,McolsToUse]
#View(holder)
holder = as.matrix(holder[,2:19])
rownames(holder) = TfhTfr_Exon_AllGenes_combined_Yes_top_2000[1:50,1]
heatmap(holder)

p = pheatmap(holder,cutree_rows = 2,cutree_cols = 8,scale = 'row',clustering_distance_cols = "canberra")
p

ggsave(file="test.svg", plot=p, width=10, height=8)
Here I am working on the hclust linkage to see if there is one that better fits the columns
MCopy_subsetYes = as.matrix(More_cell_types[,2:19]) #Blood included
colsReorder = c(1:3,7:9,13:15,4:6,10:12,16:18)
MCopy_subsetYes_reOrder = MCopy_subsetYes[,colsReorder] #re-ordered cols
rownames(MCopy_subsetYes) = More_cell_types[,1]
rownames(MCopy_subsetYes_reOrder) = More_cell_types[,1]
p = pheatmap(MCopy_subsetYes,cutree_rows = 2,cutree_cols = 2,scale = 'row')
p

RC_p = pheatmap(MCopy_subsetYes_reOrder,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
RC_p

#method
#"euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski".
# Ward Hierarchical Clustering
d <- dist(MCopy_subsetYes_reOrder, method = "euclidean") # distance matrix
fit_WD <- hclust(d, method="ward.D")
fit_WD2 <- hclust(d, method="ward.D2")
fit_S <- hclust(d, method="single")
fit_C <- hclust(d, method="complete")
fit_A <- hclust(d, method="average")
fit_MC <- hclust(d, method="mcquitty")
fit_M <- hclust(d, method="median")
fit_CE <- hclust(d, method="centroid")
library(gplots)
#rownames(TfhTfr_Exon_AllGenes_combined_Yes) = TfhTfr_Exon_AllGenes_combined_Yes[,2]
Ymatrix = unlist(TfhTfr_Exon_AllGenes_combined_Yes[1:5018,2:86])
Ymatrix = as.numeric(Ymatrix)
Ymatrix = matrix(v,nrow = 5018,ncol = 84)
heatmap(Ymatrix,symm = F,main = "Heatmap test",labRow = rownames(TfhTfr_Exon_AllGenes_combined_Yes))
Making the top calls for the other class.
TfhTfr_Exon_AllGenes_combined_No = NULL
TfhTfr_Exon_AllGenes_combined_No = cbind(TfhTfr_Exon_AllGenes_combined,rfTestPred_prob)
TfhTfr_Exon_AllGenes_combined_No = dplyr::filter( TfhTfr_Exon_AllGenes_combined_No, rfTestPred == 1 )
TfhTfr_Exon_AllGenes_combined_No = data.frame(TfhTfr_Exon_AllGenes_combined_No)
TfhTfr_Exon_AllGenes_combined_No = TfhTfr_Exon_AllGenes_combined_No[order(TfhTfr_Exon_AllGenes_combined_No$X1,decreasing = TRUE),]
TfhTfr_Exon_AllGenes_combined_No_top_1000 = TfhTfr_Exon_AllGenes_combined_No[1:1000,]
Looking at the top 50 from each class
TfhTfr_Exon_AllGenes_combined_Yes_top_50 = TfhTfr_Exon_AllGenes_combined_Yes[1:50,]
TfhTfr_Exon_AllGenes_combined_No_top_50 = TfhTfr_Exon_AllGenes_combined_No[1:50,]
groupCalls = rbind(TfhTfr_Exon_AllGenes_combined_Yes_top_50,TfhTfr_Exon_AllGenes_combined_No_top_50)
YNHeatmap_matrix = NULL
YNHeatmap_matrix = unlist(groupCalls[2:100,2:86])
YNHeatmap_matrix = as.numeric(v)
YNHeatmap_matrix = matrix(v,nrow = 98,ncol = 84)
heatmap(YNHeatmap_matrix,symm = F,main = "Heatmap test",labRow = rownames(groupCalls))
image(YNHeatmap_matrix)
write.csv(as.data.frame(Yes_columns_sub),file = "TFR_model_results_top_1000_genes.csv")
write.csv(as.data.frame(TfhTfr_Exon_AllGenes_combined_Yes),file = "TFR_model_results_Yes_genes.csv")
TfhTfr_Exon_AllGenes_combined_Yes_top_2000
write.csv(as.data.frame(TfhTfr_Exon_AllGenes_combined_Yes_top_2000),file = "TFR_model_results_Yes_genes_top_2000.csv")
write.csv(as.data.frame(TfhTfr_Exon_AllGenes_combined_No_top_1000),file = "TFR_model_results_No_genes_1000.csv")
write.csv(as.data.frame(TfhTfr_Exon_AllGenes_combined_No_top_1000),file = "TFR_model_results_No_genes_1000.csv")
write.csv(as.data.frame(groupCalls),file = "TFR_model_results_group.csv")
Not used
#BiocManager::install("M3C")
#library(M3C)
YmatrixNew = Ymatrix
#colnames(YmatrixNew) = colnames(TfhTfr_Exon_AllGenes_combined_Yes[,2:85])
umap(TfhTfr_Exon_AllGenes_combined_Yes[,2:86],colvec = as.factor(colnames(TfhTfr_Exon_AllGenes_combined_Yes[,2:86])),labels = as.factor(colnames(TfhTfr_Exon_AllGenes_combined_Yes[,2:86])))
#tfr.umap = umap(Ymatrix)
#tfr.umap
---
title: "R Notebook"
output: html_notebook
---

Loading packages

```{r message=FALSE, include=FALSE}
library(dplyr)
library(neuralnet)
library(readxl)
library(grid)
library(corrplot)
library(caret)
library(e1071)
library(ROCR)
library(ggplot2)
library(GGally)
library(PerformanceAnalytics)
library(factoextra)
library(corrplot)
library(Rtsne)
library(FactoMineR)
library(ggplot2)
library(factoextra)
library(survminer)
library(ggcorrplot)
library(readr)
library(circlize)
library(readxl)
library(stringr)
library(reshape)
library(psych)
library(ComplexHeatmap)
library(ggpubr)
library(readr)
library(gridExtra)
library(cowplot)
library(MASS)
library(fitdistrplus)
#BiocManager::install("preprocessCore")
library(preprocessCore)
library(MASS)
library(class)
library(ggplot2)
library(reshape2)
library(ROCR)
library(e1071)
library(GGally)
library(klaR)
library(randomForest)

```

Loading the data:


```{r}
model_4_21_95wRAW = NULL
model_2_7_21_new = NULL
model_2_7_21_new <- read_excel("~/Desktop/sage/model_2_7_21_newV4.xlsx")

model_4_21_95wRAW <- model_2_7_21_new
#library(tidyverse)
#model_4_21_95wRAW %>% select_if(negate(is.numeric))

dim(model_2_7_21_new)
```

```{r}
summPreds <- function(inpPred,inpTruth,inpMetrNms=c("err","acc","sens","spec")) {
  retVals <- numeric()
  for ( metrTmp in inpMetrNms ) {
    retVals[metrTmp] <- performance(prediction(inpPred,inpTruth),measure=metrTmp)@y.values[[1]][2]
  }
  retVals
}
```


```{r}
    Train = NULL
    Test = NULL
    bTrain = sample(1:nrow(model_4_21_95wRAW),round(0.7*nrow(model_4_21_95wRAW)))
    Train <- model_4_21_95wRAW[bTrain,2:87]
    Test <- model_4_21_95wRAW[-bTrain,2:87]
    Test = as.data.frame(Test)
              Train$cat = as.numeric(as.factor(Train$cat))
          Test$cat = as.numeric(as.factor(Test$cat))
          
          
```

```{r}
Y = factor(as.numeric(as.factor(Train$cat)))
tuneRF(Train[1:85],Y,ntreeTry = 300,plot=TRUE)
```



```{r echo=TRUE}
Y = factor(as.numeric(as.factor(Train$cat)))
rfTmp <- randomForest(Y~.,data=Train[,1:85],mTry = 18,ntree = 1000)
rfTestPred <- predict(rfTmp,newdata=Test[,1:85])


mseTest <- mean((as.numeric(factor(Test$cat))-as.numeric(rfTestPred))) #Calculating the mean squared error
#tmpVals <- summPreds(as.numeric(factor(Test$cat)),as.numeric(rfTestPred))

VI_F=importance(rfTmp)
VI_F
#pdf("Importance_plot.pdf")
varImpPlot(rfTmp,type=2)
#dev.off()

# LNTreg1                    43.447022
# LNTreg2                    27.107419
# LNTreg3                    32.783442
# LNTfr1                     53.615951
# LNTfr2                     52.316917
# LNTfr3                     67.765825
# FoxF_CreN_ICOS_4           29.519462
# pdf("Importance_plot.pdf")
# partialPlot(rfTmp,x.var="LNTfr3",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTfr1",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTfr2",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTreg1",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTreg3",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="LNTreg2",pred.data = as.data.frame(Train),which.class = 2)
# partialPlot(rfTmp,x.var="FoxF_CreN_ICOS_4",pred.data = as.data.frame(Train),which.class = 2)
# dev.off()
partialPlot(rfTmp,x.var="BloodTfr1",pred.data = as.data.frame(Train),which.class = 2)
partialPlot(rfTmp,x.var="BloodTfr2",pred.data = as.data.frame(Train),which.class = 2)
partialPlot(rfTmp,x.var="BloodTfr3",pred.data = as.data.frame(Train),which.class = 2)
# 
partialPlot(rfTmp,x.var="BloodTfr1",pred.data = as.data.frame(Train),which.class = 1)
partialPlot(rfTmp,x.var="BloodTfr2",pred.data = as.data.frame(Train),which.class = 1)
partialPlot(rfTmp,x.var="BloodTfr3",pred.data = as.data.frame(Train),which.class = 1)

#library(dplyr)
library(RColorBrewer)
imp <- as.data.frame(VI_F)
imp$varnames <- rownames(VI_F)
imp = imp %>% arrange(MeanDecreaseGini)
small = imp[65:85,]
#imp = order(imp)
dotchart(small$MeanDecreaseGini,labels = small$varnames,color = brewer.pal(n = 8, name = "Dark2"))


ggplot(data = small, aes(x = MeanDecreaseGini,y = reorder(varnames, MeanDecreaseGini))) +
geom_bar(stat="identity",fill = 'darkred')+
labs(y = "Condition") +
theme(legend.position = "top")



p = ggplot(small, aes(x = MeanDecreaseGini, y = reorder(varnames, MeanDecreaseGini))) +
  geom_point(aes(color = MeanDecreaseGini, size = MeanDecreaseGini), alpha = 0.5) 
  p
 ggsave(file="test.svg", plot=p, width=10, height=8)

```



```{r}
library(readxl)
TfhTfr_Exon_AllGenes_combined <- read_excel("~/Desktop/sage/TfhTfr_Exon_AllGenes_combined.xlsx")

#Getting the class
#rfTestPred <- predict(rfTmp,newdata=TfhTfr_Exon_AllGenes_combined[,2:86])
rfTestPred <- predict(rfTmp,newdata=TfhTfr_Exon_AllGenes_combined[,2:86],type = "class")
#getting the probabilities
rfTestPred_prob <- predict(rfTmp,newdata=TfhTfr_Exon_AllGenes_combined[,2:86],type = "prob")

#Here I am trying to get some random names for the GSEA gene set.
random_names = character(2000)
random_names = sample(TfhTfr_Exon_AllGenes_combined$`Feature ID`,2000)
fileConn<-file("random_names.txt")
writeLines(random_names, fileConn)
close(fileConn)
```

2 is a Yes, 1 is a No
```{r}
TfhTfr_Exon_AllGenes_combined = cbind(TfhTfr_Exon_AllGenes_combined,rfTestPred)
genes_vales = TfhTfr_Exon_AllGenes_combined[,c(1,87)]
genes_vales = cbind(genes_vales,rfTestPred_prob)

hist(as.numeric(genes_vales[,2]))
```

Here I generate an image of the final tree from the training/test data
```{r}
#rfTestPred_prob
options(repos='http://cran.rstudio.org')
have.packages <- installed.packages()
cran.packages <- c('devtools','plotrix','randomForest','tree')
to.install <- setdiff(cran.packages, have.packages[,1])
if(length(to.install)>0) install.packages(to.install)

library(devtools)
if(!('reprtree' %in% installed.packages())){
  install_github('araastat/reprtree')
}
for(p in c(cran.packages, 'reprtree')) eval(substitute(library(pkg), list(pkg=p)))
library(reprtree)

svg("TREE.svg",    width=20, height=20,pointsize = 5)
reprtree:::plot.getTree(rfTmp)
dev.off()

```

q-q plots of the p-values
```{r}
qqnorm(rfTestPred_prob, pch = 1, frame = FALSE)
qqline(rfTestPred_prob, col = "steelblue", lwd = 2)
```


```{r}
#genes_vales = cbind(genes_vales,rfTestPred_prob)
Yes_columns = NULL
Yes_columns = dplyr::filter( genes_vales, rfTestPred == 2 )
Yes_columns = data.frame(Yes_columns)
Yes_columns = Yes_columns[order(Yes_columns$X2,decreasing = TRUE),]
Yes_columns_sub = Yes_columns[1:2000,]
#View(Yes_columns_sub)


TfhTfr_Exon_AllGenes_combined_Yes = NULL
TfhTfr_Exon_AllGenes_combined_Yes = cbind(TfhTfr_Exon_AllGenes_combined,rfTestPred_prob)
TfhTfr_Exon_AllGenes_combined_Yes = dplyr::filter( TfhTfr_Exon_AllGenes_combined_Yes, rfTestPred == 2 )
TfhTfr_Exon_AllGenes_combined_Yes = data.frame(TfhTfr_Exon_AllGenes_combined_Yes)
TfhTfr_Exon_AllGenes_combined_Yes = TfhTfr_Exon_AllGenes_combined_Yes[order(TfhTfr_Exon_AllGenes_combined_Yes$X2,decreasing = TRUE),]
TfhTfr_Exon_AllGenes_combined_Yes_top_2000 = TfhTfr_Exon_AllGenes_combined_Yes[2:2000,]

```

```{r}
library(pheatmap)

#small heatmap
genesList = c('Cxcr5','Icos','Pdcd1','Bcl6','Nrn1','Tnfrsf18','Gzmb','Sh2d1a','Prdm1','Cd44','Irf4','Ezh2','Il1r2','Cxcr7','Foxp3','Bcl6','Il2rg','Il2rb','Bcl6','Ctla4','Gata3','Id2','Il10','Maf')


colsToUse = 8:13
colsToUse = c(1,colsToUse)
McolsToUse = c(colsToUse,20:25) # Just Spleen
McolsToUse = c(McolsToUse,32:37) # Blood included
Tcon_McolsToUse = c(McolsToUse,2:4,14:16,26:28) # Tcon included

subsetYes = TfhTfr_Exon_AllGenes_combined_Yes[TfhTfr_Exon_AllGenes_combined_Yes$Feature.ID %in% genesList,colsToUse]

Copy_subsetYes = as.matrix(subsetYes[,2:7])
rownames(Copy_subsetYes) = subsetYes[,1]
p = pheatmap(Copy_subsetYes,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
p
ggsave(file="subListYesGenes.svg", plot=p, width=10, height=8)

#with more cell types
More_cell_types = TfhTfr_Exon_AllGenes_combined_Yes[TfhTfr_Exon_AllGenes_combined_Yes$Feature.ID %in% genesList,McolsToUse]
#MCopy_subsetYes = as.matrix(More_cell_types[,2:13]) # with just spleen
MCopy_subsetYes = as.matrix(More_cell_types[,2:19]) #Blood included
colsReorder = c(1:3,7:9,13:15,4:6,10:12,16:18)
MCopy_subsetYes_reOrder = MCopy_subsetYes[,colsReorder] #re-ordered cols
rownames(MCopy_subsetYes) = More_cell_types[,1]
rownames(MCopy_subsetYes_reOrder) = More_cell_types[,1]
View(MCopy_subsetYes_reOrder)
p = pheatmap(MCopy_subsetYes,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
p

RC_p = pheatmap(MCopy_subsetYes_reOrder,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
RC_p
ggsave(file="Proper_order_subListYesGenes_cell_types.svg", plot=RC_p, width=10, height=8)

###############

#adding Tcon
T_con_More_cell_types = TfhTfr_Exon_AllGenes_combined_Yes[TfhTfr_Exon_AllGenes_combined_Yes$Feature.ID %in% genesList,Tcon_McolsToUse]
#MCopy_subsetYes = as.matrix(More_cell_types[,2:13]) # with just spleen
T_con_MCopy_subsetYes = as.matrix(T_con_More_cell_types[,2:28]) #Tcon included
colsReorder = c(1:6,19:21,7:12,22:24,13:18,25:27)
T_con_MCopy_subsetYes_reOrder = T_con_MCopy_subsetYes[,colsReorder] #re-ordered cols
rownames(T_con_MCopy_subsetYes) = More_cell_types[,1]
rownames(T_con_MCopy_subsetYes_reOrder) = More_cell_types[,1]
View(T_con_MCopy_subsetYes_reOrder)

T_con_RC_p = pheatmap(T_con_MCopy_subsetYes_reOrder,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
T_con_RC_p
ggsave(file="Proper_order_subListYesGenes_cell_types_T_con_.svg", plot=T_con_RC_p, width=8, height=6)

##########
#large heatmap
holder = TfhTfr_Exon_AllGenes_combined_Yes_top_2000[1:50,8:13]
#View(holder)
holder = as.matrix(holder)

rownames(holder) = TfhTfr_Exon_AllGenes_combined_Yes_top_2000[1:50,1]


heatmap(holder)

p = pheatmap(holder,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
p
ggsave(file="test.svg", plot=p, width=10, height=8)

#large Top 50
holder = NULL
holder = TfhTfr_Exon_AllGenes_combined_Yes_top_2000[1:50,McolsToUse]
#View(holder)
holder = as.matrix(holder[,2:19])

rownames(holder) = TfhTfr_Exon_AllGenes_combined_Yes_top_2000[1:50,1]


heatmap(holder)

p = pheatmap(holder,cutree_rows = 2,cutree_cols = 8,scale = 'row',clustering_distance_cols = "canberra")
p
ggsave(file="test.svg", plot=p, width=10, height=8)

```

Here I am working on the hclust linkage to see if there is one that better fits the columns
```{r}
MCopy_subsetYes = as.matrix(More_cell_types[,2:19]) #Blood included
colsReorder = c(1:3,7:9,13:15,4:6,10:12,16:18)
MCopy_subsetYes_reOrder = MCopy_subsetYes[,colsReorder] #re-ordered cols
rownames(MCopy_subsetYes) = More_cell_types[,1]
rownames(MCopy_subsetYes_reOrder) = More_cell_types[,1]

p = pheatmap(MCopy_subsetYes,cutree_rows = 2,cutree_cols = 2,scale = 'row')
p

RC_p = pheatmap(MCopy_subsetYes_reOrder,cutree_rows = 2,cutree_cols = 2,scale = 'row',clustering_distance_cols = "canberra")
RC_p

#method	
 #"euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski". 

# Ward Hierarchical Clustering
d <- dist(MCopy_subsetYes_reOrder, method = "euclidean") # distance matrix
fit_WD <- hclust(d, method="ward.D") 
fit_WD2 <- hclust(d, method="ward.D2") 
fit_S <- hclust(d, method="single") 
fit_C <- hclust(d, method="complete") 
fit_A <- hclust(d, method="average") 
fit_MC <- hclust(d, method="mcquitty") 
fit_M <- hclust(d, method="median")
fit_CE <- hclust(d, method="centroid") 



```




```{r}

library(gplots)
#rownames(TfhTfr_Exon_AllGenes_combined_Yes) = TfhTfr_Exon_AllGenes_combined_Yes[,2]
Ymatrix = unlist(TfhTfr_Exon_AllGenes_combined_Yes[1:5018,2:86])
Ymatrix = as.numeric(Ymatrix)
Ymatrix = matrix(v,nrow = 5018,ncol = 84)

heatmap(Ymatrix,symm = F,main = "Heatmap test",labRow = rownames(TfhTfr_Exon_AllGenes_combined_Yes))
```

Making the top calls for the other class.

```{r}
TfhTfr_Exon_AllGenes_combined_No = NULL
TfhTfr_Exon_AllGenes_combined_No = cbind(TfhTfr_Exon_AllGenes_combined,rfTestPred_prob)
TfhTfr_Exon_AllGenes_combined_No = dplyr::filter( TfhTfr_Exon_AllGenes_combined_No, rfTestPred == 1 )
TfhTfr_Exon_AllGenes_combined_No = data.frame(TfhTfr_Exon_AllGenes_combined_No)
TfhTfr_Exon_AllGenes_combined_No = TfhTfr_Exon_AllGenes_combined_No[order(TfhTfr_Exon_AllGenes_combined_No$X1,decreasing = TRUE),]
TfhTfr_Exon_AllGenes_combined_No_top_1000 = TfhTfr_Exon_AllGenes_combined_No[1:1000,]

```

Looking at the top 50 from each class
```{r}

TfhTfr_Exon_AllGenes_combined_Yes_top_50 = TfhTfr_Exon_AllGenes_combined_Yes[1:50,]
TfhTfr_Exon_AllGenes_combined_No_top_50 = TfhTfr_Exon_AllGenes_combined_No[1:50,]


groupCalls = rbind(TfhTfr_Exon_AllGenes_combined_Yes_top_50,TfhTfr_Exon_AllGenes_combined_No_top_50)

YNHeatmap_matrix = NULL
YNHeatmap_matrix = unlist(groupCalls[2:100,2:86])
YNHeatmap_matrix = as.numeric(v)
YNHeatmap_matrix = matrix(v,nrow = 98,ncol = 84)

heatmap(YNHeatmap_matrix,symm = F,main = "Heatmap test",labRow = rownames(groupCalls))
image(YNHeatmap_matrix)

```


```{r}

write.csv(as.data.frame(Yes_columns_sub),file = "TFR_model_results_top_1000_genes.csv")
write.csv(as.data.frame(TfhTfr_Exon_AllGenes_combined_Yes),file = "TFR_model_results_Yes_genes.csv")
TfhTfr_Exon_AllGenes_combined_Yes_top_2000
write.csv(as.data.frame(TfhTfr_Exon_AllGenes_combined_Yes_top_2000),file = "TFR_model_results_Yes_genes_top_2000.csv")
write.csv(as.data.frame(TfhTfr_Exon_AllGenes_combined_No_top_1000),file = "TFR_model_results_No_genes_1000.csv")
write.csv(as.data.frame(TfhTfr_Exon_AllGenes_combined_No_top_1000),file = "TFR_model_results_No_genes_1000.csv")
write.csv(as.data.frame(groupCalls),file = "TFR_model_results_group.csv")
```

Not used
```{r}
#BiocManager::install("M3C")
#library(M3C)

YmatrixNew = Ymatrix
#colnames(YmatrixNew) = colnames(TfhTfr_Exon_AllGenes_combined_Yes[,2:85])
umap(TfhTfr_Exon_AllGenes_combined_Yes[,2:86],colvec = as.factor(colnames(TfhTfr_Exon_AllGenes_combined_Yes[,2:86])),labels = as.factor(colnames(TfhTfr_Exon_AllGenes_combined_Yes[,2:86])))

#tfr.umap = umap(Ymatrix)
#tfr.umap
```

